SET_DEFAULT_FAST_TRAP(&ed->arch);
+ ed->arch.flags = TF_kernel_mode;
+
if ( d->id == IDLE_DOMAIN_ID )
{
ed->arch.schedule_tail = continue_idle_task;
d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
mk_l3_pgentry(__pa(d->arch.mm_perdomain_l2) | __PAGE_HYPERVISOR);
#endif
-
- ed->arch.flags = TF_kernel_mode;
}
}
}
+#ifdef __x86_64__
+
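+/*
+ * Load a selector into a segment register, catching faults via the
+ * exception-fixup table. A faulting load is retried with the null
+ * selector and the macro evaluates to 0; on success it evaluates to 1.
+ */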
+#define loadsegment(seg,value) ({ \
+ int __r = 1; \
+ __asm__ __volatile__ ( \
+ "1: movl %k1,%%" #seg "\n2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: xorl %k0,%k0\n" \
+ " movl %k0,%%" #seg "\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 8\n" \
+ " .quad 1b,3b\n" \
+ ".previous" \
+ : "=r" (__r) : "r" (value), "0" (__r) );\
+ __r; })
+
+static void switch_segments(
+ struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n)
+{
+ int all_segs_okay = 1;
+
+ if ( !is_idle_task(p->domain) )
+ {
+ __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
+ __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
+ __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
+ __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
+ }
+
+ /* Either selector != 0 ==> reload. */
+ if ( unlikely(p->arch.user_ctxt.ds |
+ n->arch.user_ctxt.ds) )
+ all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds);
+
+ /* Either selector != 0 ==> reload. */
+ if ( unlikely(p->arch.user_ctxt.es |
+ n->arch.user_ctxt.es) )
+ all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es);
+
+ /*
+ * Either selector != 0 ==> reload.
+ * Also reload to reset FS_BASE if it was non-zero.
+ */
+ if ( unlikely(p->arch.user_ctxt.fs |
+ p->arch.user_ctxt.fs_base |
+ n->arch.user_ctxt.fs) )
+ {
+ all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs);
+ if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
+ p->arch.user_ctxt.fs_base = 0;
+ }
+
+ /*
+ * Either selector != 0 ==> reload.
+ * Also reload to reset GS_BASE if it was non-zero.
+ */
+ if ( unlikely(p->arch.user_ctxt.gs |
+ p->arch.user_ctxt.gs_base_user |
+ n->arch.user_ctxt.gs) )
+ {
+ /* Reset GS_BASE with user %gs? */
+ if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user )
+ all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs);
+ if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */
+ p->arch.user_ctxt.gs_base_user = 0;
+ }
+
+ /* This can only be non-zero if selector is NULL. */
+ if ( n->arch.user_ctxt.fs_base )
+ wrmsr(MSR_FS_BASE,
+ n->arch.user_ctxt.fs_base,
+ n->arch.user_ctxt.fs_base>>32);
+
+ /* This can only be non-zero if selector is NULL. */
+ if ( n->arch.user_ctxt.gs_base_user )
+ wrmsr(MSR_GS_BASE,
+ n->arch.user_ctxt.gs_base_user,
+ n->arch.user_ctxt.gs_base_user>>32);
+
+ /* Either context's kernel GS base != 0 ==> reload the shadowed GS base. */
+ if ( p->arch.user_ctxt.gs_base_kernel |
+ n->arch.user_ctxt.gs_base_kernel )
+ wrmsr(MSR_SHADOW_GS_BASE,
+ n->arch.user_ctxt.gs_base_kernel,
+ n->arch.user_ctxt.gs_base_kernel>>32);
+
+ /* If in kernel mode then switch the GS bases around. */
+ if ( n->arch.flags & TF_kernel_mode )
+ __asm__ __volatile__ ( "swapgs" );
+
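+ /*
+ * If any guest selector faulted on reload, bounce to the guest's
+ * registered failsafe callback, with the following frame pushed on
+ * its kernel stack (lowest address last):
+ * ss, rsp, rflags, cs, rip, gs, fs, es, ds, r11, rcx.
+ */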
+ if ( unlikely(!all_segs_okay) )
+ {
+ unsigned long *rsp =
+ (n->arch.flags & TF_kernel_mode) ?
+ (unsigned long *)regs->rsp :
+ (unsigned long *)n->arch.kernel_sp;
+
+ if ( put_user(regs->ss, rsp- 1) |
+ put_user(regs->rsp, rsp- 2) |
+ put_user(regs->rflags, rsp- 3) |
+ put_user(regs->cs, rsp- 4) |
+ put_user(regs->rip, rsp- 5) |
+ put_user(regs->gs, rsp- 6) |
+ put_user(regs->fs, rsp- 7) |
+ put_user(regs->es, rsp- 8) |
+ put_user(regs->ds, rsp- 9) |
+ put_user(regs->r11, rsp-10) |
+ put_user(regs->rcx, rsp-11) )
+ {
+ DPRINTK("Error while creating failsafe callback frame.\n");
+ domain_crash();
+ }
+
+ if ( !(n->arch.flags & TF_kernel_mode) )
+ {
+ n->arch.flags |= TF_kernel_mode;
+ __asm__ __volatile__ ( "swapgs" );
+ write_ptbase(n);
+ }
+
+ regs->entry_vector = TRAP_syscall;
+ regs->rflags &= 0xFFFCBEFFUL; /* clear TF, NT, RF and VM, as on a trap */
+ regs->ss = __GUEST_SS;
+ regs->rsp = (unsigned long)(rsp-11);
+ regs->cs = __GUEST_CS;
+ regs->rip = n->arch.failsafe_address;
+ }
+}
+
+long do_switch_to_user(void)
+{
+ struct xen_regs *regs = get_execution_context();
+ struct switch_to_user stu;
+ struct exec_domain *ed = current;
+
+ if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) )
+ return -EFAULT;
+
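+ /* Switch to user context: user GS base and user page table become live. */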
+ ed->arch.flags &= ~TF_kernel_mode;
+ __asm__ __volatile__ ( "swapgs" );
+ write_ptbase(ed);
+
+ regs->rip = stu.rip;
+ regs->cs = stu.cs;
+ regs->rflags = stu.rflags;
+ regs->rsp = stu.rsp;
+ regs->ss = stu.ss;
+
+ if ( !(stu.flags & ECF_IN_SYSCALL) )
+ {
+ regs->entry_vector = 0;
+ regs->r11 = stu.r11;
+ regs->rcx = stu.rcx;
+ }
+
+ return regs->rax;
+}
+
+#elif defined(__i386__)
+
+#define switch_segments(_r, _p, _n) ((void)0)
+
+#endif
+
/*
* This special macro can be used to load a debugging register
*/
#ifdef CONFIG_VMX
unsigned long vmx_domain = next_p->arch.arch_vmx.flags;
#endif
-#ifdef __x86_64__
- int all_segs_okay = 1;
-#endif
__cli();
/* Switch guest general-register state. */
if ( !is_idle_task(prev_p->domain) )
{
-#ifdef __x86_64__
- __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (stack_ec->ds) );
- __asm__ __volatile__ ( "movl %%es,%0" : "=m" (stack_ec->es) );
- __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (stack_ec->fs) );
- __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (stack_ec->gs) );
-#endif
memcpy(&prev_p->arch.user_ctxt,
stack_ec,
sizeof(*stack_ec));
SET_FAST_TRAP(&next_p->arch);
#ifdef __i386__
- /* Switch the guest OS ring-1 stack. */
+ /* Switch the kernel ring-1 stack. */
tss->esp1 = next_p->arch.kernel_sp;
tss->ss1 = next_p->arch.kernel_ss;
#endif
__sti();
-#ifdef __x86_64__
-
-#define loadsegment(seg,value) ({ \
- int __r = 1; \
- __asm__ __volatile__ ( \
- "1: movl %k1,%%" #seg "\n2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: xorl %k0,%k0\n" \
- " movl %k0,%%" #seg "\n" \
- " jmp 2b\n" \
- ".previous\n" \
- ".section __ex_table,\"a\"\n" \
- " .align 8\n" \
- " .quad 1b,3b\n" \
- ".previous" \
- : "=r" (__r) : "r" (value), "0" (__r) );\
- __r; })
-
- /* Either selector != 0 ==> reload. */
- if ( unlikely(prev_p->arch.user_ctxt.ds) ||
- unlikely(next_p->arch.user_ctxt.ds) )
- all_segs_okay &= loadsegment(ds, next_p->arch.user_ctxt.ds);
-
- /* Either selector != 0 ==> reload. */
- if ( unlikely(prev_p->arch.user_ctxt.es) ||
- unlikely(next_p->arch.user_ctxt.es) )
- all_segs_okay &= loadsegment(es, next_p->arch.user_ctxt.es);
-
- /*
- * Either selector != 0 ==> reload.
- * Also reload to reset FS_BASE if it was non-zero.
- */
- if ( unlikely(prev_p->arch.user_ctxt.fs) ||
- unlikely(prev_p->arch.user_ctxt.fs_base) ||
- unlikely(next_p->arch.user_ctxt.fs) )
- {
- all_segs_okay &= loadsegment(fs, next_p->arch.user_ctxt.fs);
- if ( prev_p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
- prev_p->arch.user_ctxt.fs_base = 0;
- }
-
- /*
- * Either selector != 0 ==> reload.
- * Also reload to reset GS_BASE if it was non-zero.
- */
- if ( unlikely(prev_p->arch.user_ctxt.gs) ||
- unlikely(prev_p->arch.user_ctxt.gs_base_os) ||
- unlikely(prev_p->arch.user_ctxt.gs_base_app) ||
- unlikely(next_p->arch.user_ctxt.gs) )
- {
- /* Reset GS_BASE with user %gs. */
- all_segs_okay &= loadsegment(gs, next_p->arch.user_ctxt.gs);
- /* Reset KERNEL_GS_BASE if we won't be doing it later. */
- if ( !next_p->arch.user_ctxt.gs_base_os )
- wrmsr(MSR_KERNEL_GS_BASE, 0, 0);
- if ( prev_p->arch.user_ctxt.gs ) /* != 0 selector kills app gs_base */
- prev_p->arch.user_ctxt.gs_base_app = 0;
- }
-
- /* This can only be non-zero if selector is NULL. */
- if ( next_p->arch.user_ctxt.fs_base )
- wrmsr(MSR_FS_BASE,
- next_p->arch.user_ctxt.fs_base,
- next_p->arch.user_ctxt.fs_base>>32);
-
- /* This can only be non-zero if selector is NULL. */
- if ( next_p->arch.user_ctxt.gs_base_os )
- wrmsr(MSR_KERNEL_GS_BASE,
- next_p->arch.user_ctxt.gs_base_os,
- next_p->arch.user_ctxt.gs_base_os>>32);
-
- /* This can only be non-zero if selector is NULL. */
- if ( next_p->arch.user_ctxt.gs_base_app )
- wrmsr(MSR_GS_BASE,
- next_p->arch.user_ctxt.gs_base_app,
- next_p->arch.user_ctxt.gs_base_app>>32);
-
- /* If in guest-OS mode, switch the GS bases around. */
- if ( next_p->arch.flags & TF_kernel_mode )
- __asm__ __volatile__ ( "swapgs" );
-
- if ( unlikely(!all_segs_okay) )
- {
- unsigned long *rsp =
- (next_p->arch.flags & TF_kernel_mode) ?
- (unsigned long *)stack_ec->rsp :
- (unsigned long *)next_p->arch.kernel_sp;
-
- if ( put_user(stack_ec->ss, rsp- 1) |
- put_user(stack_ec->rsp, rsp- 2) |
- put_user(stack_ec->rflags, rsp- 3) |
- put_user(stack_ec->cs, rsp- 4) |
- put_user(stack_ec->rip, rsp- 5) |
- put_user(stack_ec->gs, rsp- 6) |
- put_user(stack_ec->fs, rsp- 7) |
- put_user(stack_ec->es, rsp- 8) |
- put_user(stack_ec->ds, rsp- 9) |
- put_user(stack_ec->r11, rsp-10) |
- put_user(stack_ec->rcx, rsp-11) )
- {
- DPRINTK("Error while creating failsafe callback frame.\n");
- domain_crash();
- }
-
- if ( !(next_p->arch.flags & TF_kernel_mode) )
- {
- next_p->arch.flags |= TF_kernel_mode;
- __asm__ __volatile__ ( "swapgs" );
- /* XXX switch page tables XXX */
- }
-
- stack_ec->entry_vector = TRAP_syscall;
- stack_ec->rflags &= 0xFFFCBEFFUL;
- stack_ec->ss = __GUEST_SS;
- stack_ec->rsp = (unsigned long)(rsp-11);
- stack_ec->cs = __GUEST_CS;
- stack_ec->rip = next_p->arch.failsafe_address;
- }
-
-#endif /* __x86_64__ */
+ switch_segments(stack_ec, prev_p, next_p);
}
/* Exit shadow mode before deconstructing final guest page table. */
shadow_mode_disable(d);
- /* Drop the in-use reference to the page-table base. */
+ /* Drop the in-use references to page-table bases. */
for_each_exec_domain ( d, ed )
{
if ( pagetable_val(ed->arch.pagetable) != 0 )
- put_page_and_type(&frame_table[pagetable_val(ed->arch.pagetable) >>
- PAGE_SHIFT]);
- ed->arch.pagetable = mk_pagetable(0);
+ {
+ put_page_and_type(
+ &frame_table[pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT]);
+ ed->arch.pagetable = mk_pagetable(0);
+ }
+
+ if ( pagetable_val(ed->arch.pagetable_user) != 0 )
+ {
+ put_page_and_type(
+ &frame_table[pagetable_val(ed->arch.pagetable_user) >>
+ PAGE_SHIFT]);
+ ed->arch.pagetable_user = mk_pagetable(0);
+ }
}
#ifdef CONFIG_VMX
#else
if ( unlikely(shadow_mode(d)) )
pa = pagetable_val(ed->arch.shadow_table);
+#ifdef __x86_64__
+ else if ( !(ed->arch.flags & TF_kernel_mode) )
+ pa = pagetable_val(ed->arch.pagetable_user);
+#endif
else
pa = pagetable_val(ed->arch.pagetable);
#endif
okay = new_guest_cr3(pfn);
break;
+#ifdef __x86_64__
+ case MMUEXT_NEW_USER_BASEPTR:
+ okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
+ if ( unlikely(!okay) )
+ {
+ MEM_LOG("Error while installing new baseptr %p", pfn);
+ }
+ else
+ {
+ unsigned long old_pfn =
+ pagetable_val(ed->arch.pagetable_user) >> PAGE_SHIFT;
+ ed->arch.pagetable_user = mk_pagetable(pfn << PAGE_SHIFT);
+ if ( old_pfn != 0 )
+ put_page_and_type(&frame_table[old_pfn]);
+ }
+ break;
+#endif
+
case MMUEXT_TLB_FLUSH:
percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
break;
memset(t->io_bitmap, ~0, sizeof(t->io_bitmap));
#if defined(__i386__)
t->ss0 = __HYPERVISOR_DS;
- t->esp0 = get_stack_top();
+ t->esp0 = get_stack_bottom();
#elif defined(__x86_64__)
- t->rsp0 = get_stack_top();
+ t->rsp0 = get_stack_bottom();
#endif
set_tss_desc(nr,t);
load_TR(nr);
{
__vmwrite(HOST_CR3, pagetable_val(d->arch.monitor_table));
__vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
- __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+ __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom());
if (event_pending(d)) {
if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0]))
ed->arch.shadow_table = ed->arch.pagetable;
__vmwrite(GUEST_CR3, pagetable_val(ed->arch.pagetable));
__vmwrite(HOST_CR3, pagetable_val(ed->arch.monitor_table));
- __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+ __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom());
ed->arch.schedule_tail = arch_vmx_do_resume;
}
* (9) u32 fs;
* (8) u32 ds;
* (7) u32 es;
- * <- get_stack_top() (= HOST_ESP)
+ * <- get_stack_bottom() (= HOST_ESP)
* (6) u32 ss;
* (5) u32 esp;
* (4) u32 eflags;
* (2) u32 eip;
* (2/1) u16 entry_vector;
* (1/1) u16 error_code;
- * However, get_stack_top() acturally returns 20 bytes below the real
- * top of the stack to allow space for:
+ * However, get_stack_bottom() actually returns 20 bytes before the real
+ * bottom of the stack to allow space for:
* domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers.
*/
#define VMX_MONITOR_EFLAGS 0x202 /* IF on */
ALIGN
restore_all_guest:
- testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
- jnz failsafe_callback
+ btr $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+ jc failsafe_callback
testl $X86_EFLAGS_VM,XREGS_eflags(%esp)
jnz restore_all_vm86
FLT1: movl XREGS_ds(%esp),%ds
DBLFLT1:GET_CURRENT(%ebx)
jmp test_all_events
DBLFIX1:GET_CURRENT(%ebx)
- testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
- jnz domain_crash # cannot reenter failsafe code
- orb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+ bts $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+ jc domain_crash # cannot reenter failsafe code
jmp test_all_events # will return via failsafe code
.previous
.section __pre_ex_table,"a"
/* No special register assumptions */
failsafe_callback:
GET_CURRENT(%ebx)
- andb $~TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
leal EDOMAIN_trap_bounce(%ebx),%edx
movl EDOMAIN_failsafe_addr(%ebx),%eax
movl %eax,TRAPBOUNCE_eip(%edx)
GET_CURRENT(%ebx)
andl $(NR_hypercalls-1),%eax
call *SYMBOL_NAME(hypercall_table)(,%eax,4)
-
-ret_from_hypercall:
movl %eax,XREGS_eax(%esp) # save the return value
test_all_events:
ALIGN
restore_all_guest:
- testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
- jnz failsafe_callback
+ btr $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+ jc failsafe_callback
RESTORE_ALL
testw $TRAP_syscall,4(%rsp)
jz 1f
DBLFLT1:GET_CURRENT(%rbx)
jmp test_all_events
DBLFIX1:GET_CURRENT(%rbx)
- testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
- jnz domain_crash # cannot reenter failsafe code
- orb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+ bts $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+ jc domain_crash # cannot reenter failsafe code
jmp test_all_events # will return via failsafe code
.previous
.section __pre_ex_table,"a"
/* No special register assumptions */
failsafe_callback:
GET_CURRENT(%rbx)
- andb $~TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
leaq EDOMAIN_trap_bounce(%rbx),%rdx
movq EDOMAIN_failsafe_addr(%rbx),%rax
movq %rax,TRAPBOUNCE_eip(%rdx)
* NB. We must move %r10 to %rcx for C function-calling ABI.
*/
ALIGN
-ENTRY(hypercall)
- sti
+ENTRY(syscall_enter)
movl $__GUEST_SS,8(%rsp)
pushq %r11
pushq $__GUEST_CS
pushq $0
movl $TRAP_syscall,4(%rsp)
SAVE_ALL
- movq %r10,%rcx
- andq $(NR_hypercalls-1),%rax
- leaq SYMBOL_NAME(hypercall_table)(%rip),%rbx
- callq *(%rbx,%rax,8)
GET_CURRENT(%rbx)
+ bts $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+ jc hypercall
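+ /* SYSCALL from guest user mode: switch the guest to kernel context. */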
+ swapgs
+ movq %rbx,%rdi
+ call SYMBOL_NAME(write_ptbase)
+ jmp restore_all_guest
-ret_from_hypercall:
+hypercall:
+ sti
+ movq %r10,%rcx
+ andq $(NR_hypercalls-1),%rax
+ leaq SYMBOL_NAME(hypercall_table)(%rip),%r10
+ callq *(%r10,%rax,8)
movq %rax,XREGS_rax(%rsp) # save the return value
test_all_events:
movq XREGS_rsp+8(%rsp),%rsi
testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
jnz 1f
- /* Push new frame at registered guest-OS stack top. */
+ /* Push new frame at registered guest-OS stack base. */
movq EDOMAIN_kernel_sp(%rbx),%rsi
1: movq $HYPERVISOR_VIRT_START,%rax
cmpq %rax,%rsi
/* Rewrite our stack frame and return to guest-OS mode. */
/* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */
movb $0,TRAPBOUNCE_flags(%rdx)
- testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
- jnz 1f
- orb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+ bts $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+ jc 1f
swapgs
- /* XXX switch page tables XXX */
+ movq %rbx,%rdi
+ call SYMBOL_NAME(write_ptbase)
1: movl $TRAP_syscall,XREGS_entry_vector+8(%rsp)
andl $0xfffcbeff,XREGS_eflags+8(%rsp)
movl $__GUEST_SS,XREGS_ss+8(%rsp)
.quad SYMBOL_NAME(do_set_debugreg)
.quad SYMBOL_NAME(do_get_debugreg)
.quad SYMBOL_NAME(do_update_descriptor) /* 10 */
- .quad SYMBOL_NAME(do_ni_hypercall) # do_set_fast_trap
+ .quad SYMBOL_NAME(do_ni_hypercall)
.quad SYMBOL_NAME(do_dom_mem_op)
.quad SYMBOL_NAME(do_multicall)
.quad SYMBOL_NAME(do_update_va_mapping)
.quad SYMBOL_NAME(do_grant_table_op) /* 20 */
.quad SYMBOL_NAME(do_vm_assist)
.quad SYMBOL_NAME(do_update_va_mapping_otherdomain)
- .quad SYMBOL_NAME(do_ni_hypercall) # do_switch_vm86
+ .quad SYMBOL_NAME(do_switch_to_user)
.quad SYMBOL_NAME(do_boot_vcpu)
+ .quad SYMBOL_NAME(do_set_segment_base) /* 25 */
- .rept NR_hypercalls-((.-hypercall_table)/4)
+ .rept NR_hypercalls-((.-hypercall_table)/8) # entries are 8-byte .quads
.quad SYMBOL_NAME(do_ni_hypercall)
.endr
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
-#include <asm/domain_page.h>
+#include <asm/msr.h>
void *safe_page_alloc(void)
{
return 0;
}
+long do_set_segment_base(unsigned int which, unsigned long base)
+{
+ struct exec_domain *ed = current;
+
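+ /*
+ * The caller is a guest kernel, i.e. in kernel mode after SWAPGS:
+ * its kernel GS base is live in MSR_GS_BASE and its user GS base
+ * sits in MSR_SHADOW_GS_BASE.
+ */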
+ switch ( which )
+ {
+ case SEGBASE_FS:
+ ed->arch.user_ctxt.fs_base = base;
+ wrmsr(MSR_FS_BASE, base, base>>32);
+ break;
+
+ case SEGBASE_GS_USER:
+ ed->arch.user_ctxt.gs_base_user = base;
+ wrmsr(MSR_SHADOW_GS_BASE, base, base>>32);
+ break;
+
+ case SEGBASE_GS_KERNEL:
+ ed->arch.user_ctxt.gs_base_kernel = base;
+ wrmsr(MSR_GS_BASE, base, base>>32);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
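
A guest invokes this through hypercall 25 (__HYPERVISOR_set_segment_base). A minimal guest-side sketch, assuming the x86/64 hypercall convention set up by the entry code in this patch (number in %rax, arguments in %rdi/%rsi, SYSCALL as the trap instruction; the wrapper name is illustrative):

    static inline long HYPERVISOR_set_segment_base(unsigned int which,
                                                   unsigned long base)
    {
        long ret;
        __asm__ __volatile__ (
            "syscall"
            : "=a" (ret)
            : "0" (__HYPERVISOR_set_segment_base), "D" (which), "S" (base)
            : "rcx", "r11", "memory" ); /* SYSCALL clobbers %rcx and %r11 */
        return ret;
    }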
+
/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(struct desc_struct *d)
__asm__ __volatile__ ( "hlt" );
}
-asmlinkage void hypercall(void);
+asmlinkage void syscall_enter(void);
void __init percpu_traps_init(void)
{
- char *stack_top = (char *)get_stack_top();
- char *stack = (char *)((unsigned long)stack_top & ~(STACK_SIZE - 1));
- int cpu = smp_processor_id();
+ char *stack_bottom, *stack;
+ int cpu = smp_processor_id();
+
+ stack_bottom = (char *)get_stack_bottom();
+ stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
/* Double-fault handler has its own per-CPU 1kB stack. */
init_tss[cpu].ist[0] = (unsigned long)&stack[1024];
stack[0] = 0x48;
stack[1] = 0x89;
stack[2] = 0x25;
- *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16;
+ *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
/* leaq saversp(%rip), %rsp */
stack[7] = 0x48;
stack[8] = 0x8d;
stack[9] = 0x25;
- *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16;
+ *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
- /* jmp hypercall */
+ /* jmp syscall_enter */
stack[14] = 0xe9;
- *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
+ *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19];
/*
* Trampoline for SYSCALL entry from compatibility mode.
stack[0] = 0x48;
stack[1] = 0x89;
stack[2] = 0x25;
- *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16;
+ *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
/* leaq saversp(%rip), %rsp */
stack[7] = 0x48;
stack[8] = 0x8d;
stack[9] = 0x25;
- *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16;
+ *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
- /* jmp hypercall */
+ /* jmp syscall_enter */
stack[14] = 0xe9;
- *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
+ *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19];
/*
* Common SYSCALL parameters.
*/
l1_pgentry_t *perdomain_ptes;
pagetable_t pagetable;
+ pagetable_t pagetable_user; /* x86/64: user-space pagetable. */
pagetable_t monitor_table;
pagetable_t phys_table; /* 1:1 pagetable */
#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
-#define MSR_FS_BASE 0xc0000100 /* 64bit GS base */
-#define MSR_GS_BASE 0xc0000101 /* 64bit FS base */
+#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
+#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
-#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */
+#define MSR_SHADOW_GS_BASE 0xc0000102 /* SwapGS GS shadow */
/* EFER bits: */
#define _EFER_SCE 0 /* SYSCALL/SYSRET */
#define _EFER_LME 8 /* Long mode enable */
#define TBF_FAILSAFE 16
-/* arch_exec_domain' flags values */
+/* 'arch_exec_domain.flags' values */
-#define TF_failsafe_return 1
-#define TF_kernel_mode 2
+#define _TF_failsafe_return 0
+#define _TF_kernel_mode 1
+#define TF_failsafe_return (1<<_TF_failsafe_return)
+#define TF_kernel_mode (1<<_TF_kernel_mode)
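+/* The _TF_* bit indices let the assembly entry code use bts/btr directly. */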
#ifndef __ASSEMBLY__
}
/*
- * Get the top-of-stack, as stored in the per-CPU TSS. This is actually
- * 20 bytes below the real top of the stack to allow space for:
+ * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually
+ * 20 bytes before the real bottom of the stack to allow space for:
* domain pointer, DS, ES, FS, GS.
*/
-static inline unsigned long get_stack_top(void)
+static inline unsigned long get_stack_bottom(void)
{
unsigned long p;
__asm__ ( "andl %%esp,%0; addl %2,%0"
}
/*
- * Get the top-of-stack, as stored in the per-CPU TSS. This is actually
- * 64 bytes below the real top of the stack to allow space for:
+ * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually
+ * 64 bytes before the real bottom of the stack to allow space for:
- * domain pointer, DS, ES, FS, GS, FS_BASE, GS_BASE_OS, GS_BASE_APP
+ * domain pointer, DS, ES, FS, GS, FS_BASE, GS_BASE_KERNEL, GS_BASE_USER.
*/
-static inline unsigned long get_stack_top(void)
+static inline unsigned long get_stack_bottom(void)
{
unsigned long p;
__asm__ ( "orq %%rsp,%0; andq $~7,%0"
#define HYPERVISOR_VIRT_END (0xFFFF880000000000UL)
#endif
+#ifndef __ASSEMBLY__
+
/* The machine->physical mapping table starts at this address, read-only. */
#ifndef machine_to_phys_mapping
#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
#endif
-#ifndef __ASSEMBLY__
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+
+/*
+ * int HYPERVISOR_switch_to_user(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * If flags contains ECF_IN_SYSCALL:
+ * Restore RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+struct switch_to_user {
+ /* Top of stack (%rsp at point of hypercall). */
+ u64 r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of switch_to_user stack frame. */
+} PACKED;
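
For illustration, a guest kernel might enter user context with a sequence like the one below. This is a sketch only: the frame is pushed so that %rsp points at r11 when the hypercall is issued, the $user_* operands are placeholders, and SYSCALL-as-trap-instruction follows the entry code in this patch.

        pushq $user_ss                          /* ss */
        pushq $user_rsp                         /* rsp */
        pushq $user_rflags                      /* rflags */
        pushq $user_cs                          /* cs */
        pushq $user_rip                         /* rip */
        pushq $0                                /* flags: ECF_IN_SYSCALL clear */
        pushq $0                                /* rcx (restored to user) */
        pushq $0                                /* r11 (restored to user) */
        movq  $__HYPERVISOR_switch_to_user,%rax
        syscall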
/* NB. Both the following are 64 bits each. */
typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */
u64 fs; /* Non-zero => takes precedence over fs_base. */
- u64 gs; /* Non-zero => takes precedence over gs_base_app. */
+ u64 gs; /* Non-zero => takes precedence over gs_base_user. */
u64 fs_base;
- u64 gs_base_os;
- u64 gs_base_app;
+ u64 gs_base_kernel;
+ u64 gs_base_user;
} PACKED execution_context_t;
typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
#define __HYPERVISOR_set_debugreg 8
#define __HYPERVISOR_get_debugreg 9
#define __HYPERVISOR_update_descriptor 10
-#define __HYPERVISOR_set_fast_trap 11
+#define __HYPERVISOR_set_fast_trap 11 /* x86/32 only */
#define __HYPERVISOR_dom_mem_op 12
#define __HYPERVISOR_multicall 13
#define __HYPERVISOR_update_va_mapping 14
#define __HYPERVISOR_grant_table_op 20
#define __HYPERVISOR_vm_assist 21
#define __HYPERVISOR_update_va_mapping_otherdomain 22
-#define __HYPERVISOR_switch_vm86 23
+#define __HYPERVISOR_switch_vm86 23 /* x86/32 only */
+#define __HYPERVISOR_switch_to_user 23 /* x86/64 only */
#define __HYPERVISOR_boot_vcpu 24
+#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
/*
* MULTICALLS
* val[7:0] == MMUEXT_NEW_BASEPTR:
* ptr[:2] -- Machine address of new page-table base to install in MMU.
*
+ * val[7:0] == MMUEXT_NEW_USER_BASEPTR: [x86/64 only]
+ * ptr[:2] -- Machine address of new page-table base to install in MMU
+ * when in user space.
+ *
* val[7:0] == MMUEXT_TLB_FLUSH:
* No additional arguments.
*
#define MMUEXT_CLEAR_FOREIGNDOM 11
#define MMUEXT_TRANSFER_PAGE 12 /* ptr = MA of frame; val[31:16] = dom */
#define MMUEXT_REASSIGN_PAGE 13
+#define MMUEXT_NEW_USER_BASEPTR 14
#define MMUEXT_CMD_MASK 255
#define MMUEXT_CMD_SHIFT 8
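
For illustration, a 64-bit guest could install a user-space page-table base with one extended MMU command. A minimal sketch, assuming the guest's existing mmu_update machinery; new_user_base_maddr is a placeholder for the machine address of the new base:

    mmu_update_t req;
    int rc;
    req.ptr = new_user_base_maddr | MMU_EXTENDED_COMMAND; /* ptr[:2] = machine addr */
    req.val = MMUEXT_NEW_USER_BASEPTR;                    /* val[7:0] = command     */
    rc = HYPERVISOR_mmu_update(&req, 1, NULL);            /* 0 on success           */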